###
### check balance of T / C groups on demographic dimensions
### run gen_t_c_groups.R first
###

library(tidyverse)
library(parallel)
library(hms)
library(broom)
library(ggthemes)
library(data.table)
library(lubridate)


### SET WORKING DIRECTORY HERE ###
path_to_archive <- "replication/"
data_dir <- paste0(path_to_archive, "data/")
setwd(data_dir)
plot_dir <- paste0(path_to_archive, "plots/")
tables_dir <- paste0(path_to_archive, "tables/")

demos_to_test <- c("hh_size","n_adults","n_generations", "married","has_children","n_children","own_home","home_value","n_cars","vehicle_year","net_worth","income","age","white","asian","black","hispanic","sf_home","male","college","rep","dem")
names(demos_to_test) <- demos_to_test

# demographics - measured once
### THIS FILE IS PROPRIETARY (FROM FWM DATA)
demo <- readRDS("stb_demo_20130131.rds")
###

### first, table values against representative sample
demo_table <- demo %>% as.data.table %>% .[,map(.SD, mean, na.rm=T), .SDcols = demos_to_test]

### comparison group: mediamark gfk data (THIS FILE IS PROPRIETARY)
gfk <- fread("doublebase2012.csv") %>%
	.[,`:=` (white = `[07071]: Race: White`,
			 black = `[07072]: Black/African American`,
			 asian = `[07074]: Asian`, 
			 hispanic = `[08401]: Spanish Or Hispanic Origin Or Descent: Yes`)] %>%
	.[,hh_size:= case_when(`[04091]: No. Of People In HH: 1 Person`==1 ~ 1,
						   `[04092]: 2 People`==1 ~ 2,
						   `[04093]: 3 People`==1 ~ 3,
						   `[04094]: 4 People`==1 ~ 4,
						   `[04095]: 5 People`==1 ~ 5,
						   `[04096]: 6 People`==1 ~ 6,
						   `[04097]: 7 People`==1 ~ 7,
						   `[04098]: 8 People`==1 ~ 8,
						   `[04099]: 9 People`==1 ~ 9,
						   `[0409x]: 10+ People`==1 ~ 10)] %>%
	.[,n_children:= case_when(`[04130]: No. Of Children Currently Living At Home 0-17*: 0 Children`==1 ~ 0,
						   `[04131]: 1 Child`==1 ~ 1,
						   `[04132]: 2 Children`==1 ~ 2,
						   `[04133]: 3 Children`==1 ~ 3,
						   `[04134]: 4 Children`==1 ~ 4,
						   `[04135]: 5 Children`==1 ~ 5,
						   `[04136]: 6 Children`==1 ~ 6,
						   `[04137]: 7 Children`==1 ~ 7,
						   `[04138]: 8 Children`==1 ~ 8,
						   `[04139]: 9+ Children`==1 ~ 9)] %>%
	.[,n_adults:= case_when(`[04121]: No. Of Adults In HH (18+): 1 Adult`==1 ~ 1,
						   `[04122]: 2 Adults`==1 ~ 2,
						   `[04123]: 3 Adults`==1 ~ 3,
						   `[04124]: 4 Adults`==1 ~ 4,
						   `[04125]: 5 Adults`==1 ~ 5,
						   `[04126]: 6 Adults`==1 ~ 6,
						   `[04127]: 7 Adults`==1 ~ 7,
						   `[04128]: 8 Adults`==1 ~ 8,
						   `[04129]: 9+ Adults`==1 ~ 9)] %>%
	.[,married := `[04342]: Now Married`] %>%
	.[,has_children := as.numeric(n_children > 0)] %>%
	.[,own_home := `[08211]: Own Or Rent Home: Own`] %>%
	.[,home_value := case_when(`[61401]: Value Of Owned Home: 0-49999 Dollars` ==1 ~ 25, 
							`[61402]: 50000-74999 Dollars` ==1 ~ 62.5,
							`[61403]: 75000-99999 Dollars` ==1 ~ 87.5,
							`[61404]: 100000-124999 Dollars` ==1 ~ 112.5,
							`[61405]: 125000-149999 Dollars` ==1 ~ 137.5,
							`[61406]: 150000-199999 Dollars` ==1 ~ 175,
							`[61407]: 200000-249999 Dollars` ==1 ~ 225,
							`[61408]: 250000-299999 Dollars` ==1 ~ 275,
							`[61409]: 300000-399999 Dollars` ==1 ~ 350,
							`[61400]: 400000-499999 Dollars` ==1 ~ 450,
							`[6140x]: 500000-749999 Dollars` ==1 ~ 625,
							`[6140y]: 750000+ Dollars` ==1 ~ 1000)] %>%
	.[,n_cars := case_when(`[199111]: Automobiles And Other Vehicles - Number Owned Or Leased: 1` == 1 ~ 1,
						  `[199112]: 2` ==1 ~ 2,
						  `[199113]: 3` ==1 ~ 3,
						  `[199114]: 4` ==1 ~ 4,
						  `[199115]: 5+` ==1 ~ 5,
						  `[19911Y]: Any` == 0 ~ 0)] %>%
	.[,vehicle_year := case_when(`[207792]: 2011: Net Any Vehicle (currently owned/leased)` == 1 ~ 2011,
								 `[207793]: 2010: Net Any Vehicle (currently owned/leased)` == 1 ~ 2010,
								 `[207794]: 2009: Net Any Vehicle (currently owned/leased)` == 1 ~ 2009,
								 `[207795]: 2008: Net Any Vehicle (currently owned/leased)` == 1 ~ 2008,
								 `[207796]: 2007: Net Any Vehicle (currently owned/leased)` == 1 ~ 2007,
								 `[207797]: 2006: Net Any Vehicle (currently owned/leased)` == 1 ~ 2006,
								 `[207798]: 2005: Net Any Vehicle (currently owned/leased)` == 1 ~ 2005,
								 `[207799]: 2004: Net Any Vehicle (currently owned/leased)` == 1 ~ 2004,
								 `[207790]: 2003: Net Any Vehicle (currently owned/leased)` == 1 ~ 2003,
								 `[20779x]: 2002: Net Any Vehicle (currently owned/leased)` == 1 ~ 2002,
								 `[20779y]: 2001 or earlier: Net Any Vehicle (currently owned/leased)` == 1 ~ 2001)] %>%
	.[,net_worth := case_when(`[72781]: Household Net Worth: 0-49999 Dollars` == 1 ~ 25,
							  `[72782]: 50000-99999 Dollars` == 1 ~ 75,
							  `[72783]: 100000-149999 Dollars` == 1 ~ 125,
							  `[72784]: 150000-199999 Dollars` == 1 ~ 175,
							  `[72785]: 200000-249999 Dollars` == 1 ~ 225,
							  `[72786]: 250000-299999 Dollars` == 1 ~ 275,
							  `[72787]: 300000-349999 Dollars` == 1 ~ 325,
							  `[72788]: 350000-399999 Dollars` == 1 ~ 375,
							  `[72789]: 400000-499999 Dollars` == 1 ~ 450,
							  `[72780]: 500000-749999 Dollars` == 1 ~ 625,
							  `[7278x]: 750000-999999 Dollars` == 1 ~ 875,
							  `[7278y]: 1000000+ Dollars` == 1 ~ 1250)] %>%
	.[,income := case_when(`[08191]: Household Income: 0-4999 Dollars` == 1 ~ 2.5,
							`[08192]: 5000-9999 Dollars` == 1 ~ 7.5,
							`[08193]: 10000-14999 Dollars` == 1 ~ 12.5,
							`[08194]: 15000-19999 Dollars` == 1 ~ 17.5,
							`[08195]: 20000-24999 Dollars` == 1 ~ 22.5,
							`[08196]: 25000-29999 Dollars` == 1 ~ 27.5,
							`[08197]: 30000-34999 Dollars` == 1 ~ 32.5,
							`[08198]: 35000-39999 Dollars` == 1 ~ 37.5,
							`[08199]: 40000-44999 Dollars` == 1 ~ 42.5,
							`[08190]: 45000-49999 Dollars` == 1 ~ 47.5,
							`[0819x]: 50000-59999 Dollars` == 1 ~ 55,
							`[0819y]: 60000-74999 Dollars` == 1 ~ 62.5,
							`[08201]: 75000-99999 Dollars` == 1 ~ 87.5,
							`[08202]: 100000-149999 Dollars` == 1 ~ 125,
							`[08204]: 150000-199999 Dollars` == 1 ~ 175,
							`[08206]: 200000-249999 Dollars` == 1 ~ 225,
							`[08207]: 250000+ Dollars` == 1 ~ 300)] %>%
	.[,age:= case_when(`[04311]: Respondent - Age: 18 Years` == 1 ~ 18,
						`[04312]: 19 Years` == 1 ~ 19,
						`[04313]: 20 Years` == 1 ~ 20,
						`[04314]: 21 Years` == 1 ~ 21,
						`[04315]: 22-24 Years` == 1 ~ 23,
						`[04316]: 25-29 Years` == 1 ~ 27,
						`[04317]: 30-34 Years` == 1 ~ 32,
						`[04318]: 35-39 Years` == 1 ~ 37,
						`[04319]: 40-44 Years` == 1 ~ 42,
						`[04310]: 45-49 Years` == 1 ~ 47,
						`[0431x]: 50-54 Years` == 1 ~ 52,
						`[0431y]: 55-59 Years` == 1 ~ 57,
						`[04321]: 60-64 Years` == 1 ~ 62,
						`[04322]: 65-69 Years` == 1 ~ 67,
						`[04323]: 70-74 Years` == 1 ~ 72,
						`[04324]: 75+ Years` == 1 ~ 80)] %>%
	.[,college := case_when(`[05755]: Bachelor's degree` == 1 ~ 1, 
						   `[05756]: Post-graduate degree` == 1 ~ 1,
						   `[05753]: Some college, no degree` == 1 ~ 1,
						   TRUE ~ 0)] %>%
	.[,male:=`[04481]: HOH - Sex: Male` ] %>%
	.[,dem:= `[240106]: Political Outlook/Affiliation & Voting - Political Party, If Any, Affiliated With: Democratic`] %>%
	.[,rep:= `[240107]: Republican`]


gfk_table <- gfk[`[76451]: HH subscribes to cable`==1, map(.SD, weighted.mean, w=wgtpop, na.rm=T), .SDcols = setdiff(demos_to_test, c("n_generations", "sf_home"))]
gfk_table[,source:="GfK"]
demo_table[,source:="FWM"]

comparo_table <- melt(rbind(demo_table, gfk_table, fill = T), id.vars = c("source")) %>%
	dcast(variable ~ source) %>%
	.[!is.na(FWM) & !is.na(GfK) & ! variable == "male"] %>%
	.[,variable := factor(variable, levels=c("age","married", "hh_size", "n_adults", "n_children", "has_children",
										   "own_home", "home_value",
										   "n_cars","vehicle_year",
										   "income","net_worth",
										   "college",
										   "white","black","hispanic","asian",
										   "rep","dem"))] %>%
	.[,variable := variable %>% 
					str_replace(fixed("_"), " ") %>%
					str_replace("^n ", "No. ") %>%
					str_to_title %>%
					recode(`Sf Home`="Home Size (SF)", `Hh Size` = "HH Size", Rep = "Republican", Dem = "Democrat")] %>%
	.[,group := c(rep("HH Composition",6), rep("Housing", 2), rep("Auto", 2), rep("Income / Wealth", 2), "Education", rep("Race", 4), rep("Party ID", 2))]

library(gt)

comparo_table %>% gt(rowname_col = "variable") %>% 
	tab_row_group(group = "Party ID", group == "Party ID") %>%
	tab_row_group(group = "Race", group == "Race") %>%
	tab_row_group(group = "Education", group == "Education") %>% 
	tab_row_group(group = "Income / Wealth", group == "Income / Wealth") %>% 
	tab_row_group(group = "Auto", group == "Auto") %>% 
	tab_row_group(group = "Housing", group == "Housing") %>% 
	tab_row_group(group = "HH Composition", group == "HH Composition") %>%
	tab_spanner(label = "Source", columns = vars(FWM, GfK)) %>%
	cols_hide(vars(group)) %>%
	fmt_number(vars(FWM, GfK), decimals = 2, drop_trailing_zeros=2) %>%
	tab_source_note("Notes: Figures for FWM are simple averages of all households in tracking data. For GfK, respondents are weighted by GfK's inverse sampling weights. Income, wealth, and housing values use different bins and top-coded values in GfK and FWM data and hence some difference may be attributable to binning choices. Party ID in FWM data is a combination of survey response and party registration in partisan registration states, with registration preferred where it is available. In GfK it is based on survey response only.") %>%
	as_latex() %>%
	cat(file = paste0(tables_dir, "demo_comparison.tex"))
	
	
	
	
	
	

# function to extract regression estimates of treatment "effect"
# and differences in means across treatment / control groups

get_demo_diffs <- function(t_c, d) {
	# get t / c units demo data
	tcdemo <- d[t_c, on = "device_id", nomatch=0]
	tcdemo[,treated := as.numeric(group=="T")]

	if(sum(tcdemo$group=="T") == 0 | sum(tcdemo$group%in%c("C1","C2","C1+C2")) == 0) return(data.table(variable=demos_to_test, t_eff_est=NA, t_eff_sd=NA, t_eff_p=NA, t_mean=NA, c_mean=NA, t_count=0, c_count=0))

	# for each outcome, regress on T indicator
	regs <- demos_to_test %>% 
		map(possibly(~ lm(as.formula(paste(., "treated", sep = " ~ ")), data=tcdemo),
					 otherwise=lm(0 ~ 0)))  # catch lm failures if there are no non-NA cases in T / C group
	reg_results <- regs %>% 
		imap(~ tidy(.x) %>% mutate(variable=.y)) %>%
		bind_rows %>% 
		filter(term == "treated")


	# mean by t/c group for each
	avgs <- tcdemo[,map(.SD, ~ mean(., na.rm=T)), by=.(treated), .SDcols=demos_to_test] %>%
		gather(key = "variable", value = "mean", -treated) %>% 
		spread(key = treated, value = mean) %>%
		rename(t_mean=`1`, c_mean=`0`)

	avg_count <- tcdemo[,map(.SD, ~ sum(!is.na(.))), by=.(treated), .SDcols=demos_to_test] %>% 
		gather(key = "variable", value = "count", -treated) %>% 
		spread(key = treated, value = count)%>%
		rename(t_count=`1`, c_count=`0`)

	reg_results %>% 
		inner_join(avgs, by = "variable") %>% 
		inner_join(avg_count, by = "variable") %>%
		select(variable, t_eff_est=estimate, t_eff_sd=std.error, t_eff_p = p.value, t_mean, c_mean, t_count, c_count)
}

# day loop
balance_day <- function(date) {
	cat(as.character(date), "\n")

	# load treatment / control group assignment (by device)
	t_c_df <- readRDS(paste0(data_dir, "dd/tc_groups_", as.character(date), ".rds")) %>%
		filter(map_int(t_c, nrow) > 0)

	# load ref data for device to hh matching
	### THESE ARE PROPRIETARY FILES (FROM FWM DATA)
	ref <- readRDS(paste0("stb/ref_data/stb_ref_data_", as.character(date), ".rds"))
	###

	# join
	device_demo <- inner_join(ref, demo, by = "household_id") %>% as.data.table

	# loop over ads: for each ad, pull T/C group
	out <- t_c_df %>% 
		mutate(balance = map(t_c_df$t_c, get_demo_diffs, d = device_demo)) %>% 
		select(-t_c) %>%
		unnest(cols=balance)


	out %>% saveRDS(paste0(data_dir, "balance/ads_dd_balance_", as.character(date), ".rds"))

	out 
}

dates <- seq(mdy("09/01/2012"), mdy("11/06/2012"), by = "days")


# balance_all <- dates %>% map_dfr(balance_day)

# save(balance_all, file=paste0(data_dir, "balance/ads_dd_balance_all.RData"))
load(paste0(data_dir, "balance/ads_dd_balance_all.RData"))

### analysis and plots ###

theme_set(theme_few())
# 1. distribution of p-values by variable

# prettify labels
balance_all$variable <- balance_all$variable %>%
	factor(levels=c("age","male", "married", "hh_size", "n_adults", "n_children", "n_generations", "has_children",
				   "own_home", "home_value", "sf_home",
				   "n_cars","vehicle_year",
				   "income","net_worth",
				   "college",
				   "white","black","hispanic","asian",
				   "rep","dem")) %>%
	str_replace(fixed("_"), " ") %>%
	str_replace("^n ", "No. ") %>%
	str_to_title %>%
	recode(`Sf Home`="Home Size (SF)", `Hh Size` = "HH Size", Rep = "Republican", Dem = "Democrat")

### FIGURE B.2
p_val_dens <- ggplot(data=balance_all, aes(x=t_eff_p)) + 
				geom_density() + 
				facet_wrap(~ variable) +
				xlab("Treatment Effect p-value") +
				ylab("Density")
ggsave(plot=p_val_dens,filename=paste0(plot_dir, "~balance_p_val_dist.pdf"), height=7, width=10)

# 2. distribution of coefficient estimates by variable
balance_all <- balance_all %>% mutate(coef_wgt = (c_count * t_count) / (t_count + c_count))
t_eff_means <- balance_all %>%
	group_by(variable) %>%
	summarize(t_eff_mean = mean(t_eff_est, na.rm=T),
			  t_eff_wmean = weighted.mean(t_eff_est, w=coef_wgt, na.rm=T))

### FIGURE B.1
t_eff_dens <- ggplot(aes(x=t_eff_est), data=balance_all) + 
		geom_density() + 
		facet_wrap(~ variable, scales="free") +
		geom_vline(aes(xintercept =t_eff_wmean), data= t_eff_means, lty="dashed") +
		xlab("Treatment 'Effects' on Demographic Variables") +
		ylab("Density") 

ggsave(t_eff_dens, filename=paste0(plot_dir, "balance_t_eff_dist.pdf"), height=7, width=10)


